Objectives
Descriptions of the Python packages used in this notebook:
# Install third-party dependencies used below (notebook shell magics).
!pip install vaderSentiment
!pip install wordcloud
# Data Analysis and Wrangling
import numpy as np
import pandas as pd
# machine learning
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
#NLP
import spacy
import nltk
from nltk.corpus import stopwords
import re
#visualization
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud
#Sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# library for LDA visualization
import pyLDAvis
import pyLDAvis.sklearn
# Render pyLDAvis panels inline in the notebook output.
pyLDAvis.enable_notebook()
from argparse import Namespace
import warnings
# Silence library warnings to keep notebook output readable.
warnings.filterwarnings("ignore")
# Central notebook configuration: input/output file names, the number of
# LDA topics, and the POS tags kept during lemmatization.
_config = {
    'input_file': 'productReviewShopee_1.csv',
    'output_file': 'productReviewShopee_1_sentiment.csv',
    'topic_file_name': 'Topics_Visualization_',
    'n_categories': 5,
    'pos_filter': ['NOUN', 'VERB', 'ADJ', 'ADV'],
}
flags = Namespace(**_config)
# Report interpreter and library versions for reproducibility of results.
import sys
print('The python version is {}.'.format(sys.version))
print('The numpy version is {}.'.format(np.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The spacy version is {}.'.format(spacy.__version__))
print('The nltk version is {}.'.format(nltk.__version__))
Read the training dataset into a pandas DataFrame.
# Load the raw review data set named in the configuration block.
df = pd.read_csv(flags.input_file)
print('Dataframe dimensions:', df.shape)
# Preview the first rows to check the columns loaded as expected.
df.head(5)
The data contains the following columns:
For the scope of analysis, the comments column will be used for Topic Modeling
# Drop rows with no comment text, then normalize the remaining comments.
df = df[df['comments'].notnull()]
print('Dataframe dimensions:', df.shape)


def _clean_comment(text):
    """Strip {...} template fragments and punctuation from a raw comment.

    Keeps spaces, tabs, word characters and periods; every other character
    is removed, and surrounding whitespace is trimmed.
    """
    text = re.sub(r'\{.*?\}', '', text)
    # NOTE: the original class r'[^ \\t\w\.]' escaped the backslash, so it
    # preserved literal backslashes and stripped tab characters; r'[^ \t\w.]'
    # matches the evident intent (keep tabs, drop backslashes).
    return re.sub(r'[^ \t\w.]', '', text).strip()


df['comments_cleaned'] = df['comments'].apply(_clean_comment)
spaCy is an open-source software library for advanced Natural Language Processing
Lemmatization is the process of converting a word to its base form. We use spaCy's lemmatizer: parse each sentence with the loaded model object `nlp` and extract the lemma of every token.
Examples of lemmatization:
# Load the English pipeline; NER is disabled because only tagging/parsing
# and lemmas are needed here.
# NOTE(review): the 'en' shorthand works only on spaCy 2.x — spaCy 3+
# requires e.g. spacy.load('en_core_web_sm'); confirm the pinned version.
nlp = spacy.load('en', disable=["ner"])
# NLTK English stop-word list, extended with corpus-specific filler words.
stop_en = stopwords.words('english')
custom_stop_words = ["like", "etc"]
stop_en = stop_en + custom_stop_words
def token_filter(token):
    """Return True for tokens worth keeping.

    A token is kept when it is not punctuation, whitespace, a stop word or
    a digit, and its part-of-speech tag appears in flags.pos_filter.
    """
    if token.is_punct or token.is_space or token.is_stop or token.is_digit:
        return False
    return token.pos_ in flags.pos_filter
%%time
# Lemmatize each cleaned comment: keep only tokens passing token_filter,
# drop spaCy's '-PRON-' lemma placeholder, and drop stop words (checked on
# both the surface form and the lemma), then lower-case and re-join.
df['comments_lemma'] = df['comments_cleaned'].apply(lambda x: ' '.join([token.lemma_.lower() for token in nlp(x)
                                                                        if token_filter(token) &
                                                                        (token.lemma_ != '-PRON-')
                                                                        & (not token.text.lower() in stop_en)
                                                                        & (not token.lemma_.lower() in stop_en)]))
Each text should be represented as a vector with length equal to the vocabulary size. Each dimension of this vector corresponds to the occurrence of a word in a text.
CountVectorizer from scikit-learn is used to convert raw text into a matrix of token counts.
TF-IDF should not be used with the LDA algorithm because:
# Keep only non-empty lemmatized comments for vectorization.
tokenized_data = df[df['comments_lemma'] != '']['comments_lemma'].values
# Bag-of-words matrix for LDA: drop terms appearing in more than 50% of
# documents or in fewer than 15 documents.
tf_vectorizer = CountVectorizer(max_df=0.5,
                                min_df=15,
                                token_pattern='[a-zA-Z]{3,}',  # alphabetic tokens of 3+ characters
                                lowercase = True,
                                ngram_range=(1,2)
                                )
vector = tf_vectorizer.fit_transform(tokenized_data)
print("Total number of samples: %d; Total number of features: %d" % vector.shape)
print()
# Exploratory unigram frequencies over the full (unfiltered) vocabulary.
tf_vec = CountVectorizer()
tf = tf_vec.fit_transform(tokenized_data)
unigram_totals = np.asarray(tf.sum(axis=0)).ravel()
term_freq_df_st = pd.DataFrame({'Total': unigram_totals},
                               index=tf_vec.get_feature_names())
term_freq_df_st = term_freq_df_st.sort_values('Total', ascending=False)
# Keep purely alphabetic terms only.
term_freq_df_st = term_freq_df_st.loc[term_freq_df_st.index.str.isalpha()]

# Bar chart of the 50 most frequent words.
top_words = term_freq_df_st['Total'][:50]
y_pos = np.arange(50)
plt.figure(figsize=(12, 10))
plt.bar(y_pos, top_words, align='center', alpha=0.5)
plt.xticks(y_pos, top_words.index, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 50 words')
plt.title('Top 50 words in comments')
# Repeat the frequency analysis for bigrams only.
tf_vec_2 = CountVectorizer(ngram_range=(2,2))
tf_2 = tf_vec_2.fit_transform(tokenized_data)
bigram_totals = np.asarray(tf_2.sum(axis=0)).ravel()
term_freq_df_st = pd.DataFrame({'Total': bigram_totals},
                               index=tf_vec_2.get_feature_names())
term_freq_df_st = term_freq_df_st.sort_values('Total', ascending=False)

# Bar chart of the 50 most frequent two-word phrases.
top_phrases = term_freq_df_st['Total'][:50]
y_pos = np.arange(50)
plt.figure(figsize=(12, 10))
plt.bar(y_pos, top_phrases, align='center', alpha=0.5)
plt.xticks(y_pos, top_phrases.index, rotation='vertical')
plt.ylabel('Frequency')
plt.xlabel('Top 50 phrases')
plt.title('Top 50 phrases in comments')
pd.value_counts(df['categorie']).plot(kind="bar")
Topic (category) modeling task is based on Latent Dirichlet Allocation (LDA) algorithm. LDA is generative probabilistic model. The model treats documents as a bag-of-words and attempts to find the structure or topics in this collection. LDA groups definitions based on the words that occur in them. As definitions on similar topics tend to use a similar sub-vocabulary, the resulting clusters of documents can be interpreted as different 'categories'. LDA defines each topic as a bag of words. Every definition will have the probability of belonging to a category.
Main parameter for LDA model is a number of topics (categories)
# LDA topic model over the document-term matrix; the number of topics is
# taken from the configuration (flags.n_categories).
lda = LatentDirichletAllocation(n_components=flags.n_categories,
                                random_state=0,  # fixed seed for reproducibility
                                verbose=True,
                                max_iter=10,
                                # NOTE(review): batch_size only applies with
                                # learning_method='online'; the default method
                                # is batch — confirm intent.
                                batch_size=1000
                                )
%%time
lda.fit(vector)
def print_top_words(model, feature_names, n_top_words):
    """Print the n_top_words highest-weight terms of every topic in *model*.

    model: a fitted LDA estimator exposing `components_` (topic-term weights).
    feature_names: vocabulary list aligned with the columns of components_.
    """
    for idx, weights in enumerate(model.components_):
        # argsort ascending, reverse, then take the heaviest n_top_words terms.
        top_indices = weights.argsort()[::-1][:n_top_words]
        terms = ", ".join(feature_names[i] for i in top_indices)
        print("Topic %d: " % idx + terms)
    print()
print("Top words/phrases in each topic")
print()
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); confirm the pinned sklearn version.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 10)
LDA topic modeling visualization using pyLDAvis library. Definitions of visual elements in LDA visualization:
pyLDAvis displays N circles. N is number of categories. Each circle represents each category whose areas are set to be proportional to the proportions of the categories across the total tokens in the text (definition). The area of circle for topic is set to be proportional to the estimated number of tokens that were generated by topic across all definitions. The Jensen-Shannon divergence is used to compute distances between topics. The Jensen–Shannon divergence is a method of measuring the similarity between two probability distributions.
Red horizontal bars represent the estimated number of times a given term was generated by a given category. When a category(topic) is selected, the red bars are shown for the 30 most relevant terms for the selected topic.
Blue horizontal bars represent the overall frequency of each term in all definitions.
# Build the interactive pyLDAvis panel from the fitted model, the
# document-term matrix and the vectorizer vocabulary.
vis_data = pyLDAvis.sklearn.prepare(lda, vector, tf_vectorizer)
# save LDA visualization to html file on a disk
pyLDAvis.save_html(vis_data, 'Topics_Visualization.html')
Extract topics per category
def get_topics_per_category(category_name):
    """Fit a per-category LDA model and save its pyLDAvis panel to HTML.

    Selects the lemmatized comments belonging to *category_name*, vectorizes
    them with the same CountVectorizer settings used globally, fits an LDA
    model with flags.n_categories sub-topics, prints the top words/phrases
    per topic, and writes the interactive visualization to
    flags.topic_file_name + category_name + '.html'.
    """
    data = df[df['categorie'] == category_name]['comments_lemma'].values
    print("Train model")
    # Same vectorizer settings as the global model: drop terms in >50% of
    # documents or in fewer than 15 documents; unigrams and bigrams.
    tf_vector = CountVectorizer(max_df=0.5,
                                min_df=15,
                                token_pattern='[a-zA-Z]{3,}',  # alphabetic tokens of 3+ characters
                                lowercase = True,
                                ngram_range=(1,2)
                                )
    vec = tf_vector.fit_transform(data)
    lda_cat = LatentDirichletAllocation(n_components=flags.n_categories,
                                        random_state=0,  # fixed seed for reproducibility
                                        verbose=False,
                                        max_iter=10,
                                        batch_size=1000
                                        )
    lda_cat.fit(vec)
    print("Training was finished")
    print()
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
    # favour of get_feature_names_out(); confirm the pinned sklearn version.
    tf_feature_names = tf_vector.get_feature_names()
    print("Top words/phrases in each topic for category: " + category_name)
    print()
    print_top_words(lda_cat, tf_feature_names, 10)
    print("Saving results to the file ")
    vis_data = pyLDAvis.sklearn.prepare(lda_cat, vec, tf_vector)
    pyLDAvis.save_html(vis_data, flags.topic_file_name+category_name+'.html')
%%time
# Per-category topic models for the three product categories in the data.
get_topics_per_category('Bag')
%%time
get_topics_per_category('Men’s fashion')
%%time
get_topics_per_category('Women’s fashion')
VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. VADER relies on a sentiment lexicon: a list of lexical features (e.g., words) that are labelled according to their semantic orientation as either positive or negative.
VADER has a lot of advantages over traditional methods of Sentiment Analysis, including:
# VADER sentiment analyzer; quick sanity check on a clearly negative word.
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores('horrible')
def get_sentiment_score(text):
    """Return VADER's compound polarity score for *text*.

    The compound score is a normalized value in [-1, 1]; higher means more
    positive sentiment.
    """
    scores = analyzer.polarity_scores(text)
    return scores['compound']
def get_sentiment_name(score):
    """Map a VADER compound score to a label using the standard ±0.05 cut-offs."""
    if score <= -0.05:
        return 'Negative'
    return 'Positive' if score >= 0.05 else 'Neutral'
def get_positive_words(text):
    """Return a space-joined string of the tokens VADER rates as positive."""
    hits = [word for word in text.split()
            if analyzer.polarity_scores(word)['pos'] > 0.05]
    return ' '.join(hits)
def get_negative_words(text):
    """Return a space-joined string of the tokens VADER rates as negative."""
    hits = [word for word in text.split()
            if analyzer.polarity_scores(word)['neg'] > 0.05]
    return ' '.join(hits)
%%time
# Score every raw comment with VADER, then derive the label and the
# positive/negative word lists from the scores.
df['sent_score'] = df['comments'].apply(lambda text: get_sentiment_score(text))
df['sentiment'] = df['sent_score'].apply(lambda score: get_sentiment_name(score))
df['positive_words'] = df['comments'].apply(lambda text: get_positive_words(text))
df['negative_words'] = df['comments'].apply(lambda text: get_negative_words(text))
def word_cloud_vis(category_name, sentiment, title):
    """Draw a word cloud for one of the sentiment-word columns.

    category_name: product category to filter on; '' means all rows.
    sentiment: name of the df column holding space-separated words.
    title: plot title prefix; the category name is appended to it.
    """
    if category_name == '':
        rows = df
    else:
        rows = df[df['categorie'] == category_name]
    text = ' '.join(rows[sentiment].values).lower()
    cloud = WordCloud(width=800, height=400, background_color='white',
                      max_words=100, relative_scaling=0,
                      normalize_plurals = True).generate(text)
    plt.figure(figsize=(20, 10))
    plt.title(title + category_name, fontsize=50)
    # Display the generated image:
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
# Overall positive/negative word clouds, then per-category versions.
word_cloud_vis('', 'positive_words', "Positive Sentiment Words")
word_cloud_vis('', 'negative_words', "Negative Sentiment Words")
word_cloud_vis('Bag', 'positive_words', "Positive Sentiment Words for ")
word_cloud_vis('Bag', 'negative_words', "Negative Sentiment Words for ")
word_cloud_vis('Men’s fashion', 'positive_words', "Positive Sentiment Words for ")
word_cloud_vis('Men’s fashion', 'negative_words', "Negative Sentiment Words for ")
word_cloud_vis('Women’s fashion', 'positive_words', "Positive Sentiment Words for ")
word_cloud_vis('Women’s fashion', 'negative_words', "Negative Sentiment Words for ")
from spacy.symbols import nsubj, VERB, dobj
from spacy import displacy
# Sample comment used to illustrate the dependency-parse structure.
comment_doc = nlp(u"Great value Good product quality Great value")
displacy.render(comment_doc, style="dep", jupyter=True)
# Tabulate the dependency-parse attributes of each token in the sample doc.
global_list = []
for token in comment_doc:
    global_list.append([
        token.text,
        token.dep_,
        token.head.text,
        token.head.pos_,
        list(token.children),
        token.pos_,
        list(token.head.rights),
        list(token.head.lefts),
    ])
df_3 = pd.DataFrame(global_list, columns=["Text", "Dep", "Head text", "Head POS", "Children", "POS", 'rights', 'lefts'])
df_3
def get_aspect_opinion(doc):
    """Extract 'aspect: opinion' pairs from a dependency-parsed comment.

    Nouns/pronouns in ROOT/dobj/compound/nsubj roles are treated as aspects;
    adjective (or negation) dependents supply the opinion words. Pronoun
    aspects are normalized to the generic name 'product'. Returns the pairs
    joined by '; ', or '' when nothing is found.
    """
    def aspect_name(token):
        # Pronouns ("it", "this") are mapped to the generic aspect 'product'.
        return 'product' if token.pos_ == 'PRON' else token.text

    def opinion_words(tokens):
        return [t.text for t in tokens if t.pos_ == 'ADJ' or t.dep_ == 'neg']

    pairs = []
    for token in doc:
        is_nominal = token.pos_ in ('NOUN', 'PRON')
        if is_nominal and token.dep_ in ('ROOT', 'dobj', 'compound'):
            words = opinion_words(token.children)
            if words:
                pairs.append(aspect_name(token) + ": " + ' '.join(words))
        elif is_nominal and token.dep_ == 'nsubj':
            # Look right and left of the head, then at the token's own children.
            for candidates in (token.head.rights, token.head.lefts, token.children):
                words = opinion_words(candidates)
                if words:
                    pairs.append(aspect_name(token) + ": " + ' '.join(words))
        elif token.pos_ == 'ADJ' and token.head.pos_ == 'ADJ':
            pairs.append("product: " + token.text)
    return '; '.join(pairs)
%%time
# Aspect-opinion extraction on cleaned comments that produced lemmas; rows
# excluded by the mask come back as NaN and are blanked out below.
df['aspect_opinion'] = df[df['comments_lemma']!='']['comments_cleaned'].apply(lambda text: get_aspect_opinion(nlp(text)))
# NOTE(review): inplace fillna on a column selection is deprecated in
# pandas 2.x — confirm the pinned pandas version.
df['aspect_opinion'].fillna('',inplace=True)
# Number of comments with no extracted aspect/opinion pairs.
len(df[df['aspect_opinion'] == ''])
# Persist the enriched dataset to the configured output file.
df[['title', 'rating', 'date', 'categorie', 'comments', 'sentiment', 'aspect_opinion']].to_csv(flags.output_file)
VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text
http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf
Latent Dirichlet Allocation